{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "## 计算向量，统计结果等\n",
    "### 获得基于第二阶段模型所得标问向量"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "from task_sentence_embedding_FinanceFAQ_step2_1 import model\n",
    "from config import *\n",
    "\n",
    "# get list\n",
    "q_std_list = pd.read_csv(q_std_file, sep=\"\\t\", names=['c']).c.tolist()\n",
    "q_corpus = pd.read_csv(q_corpus_file, sep=\"\\t\", names=['c']).c.tolist()\n",
    "\n",
    "# get embeddings\n",
    "q_std_sentence_embeddings = model.encode(q_std_list)\n",
    "print('保存二阶段标准问向量：', sec_q_std_vectors_file)\n",
    "np.save(sec_q_std_vectors_file, q_std_sentence_embeddings)\n",
    "q_corpus_sentence_embeddings = model.encode(q_corpus)\n",
    "print('保存二阶段所有语料向量：', sec_q_corpus_vectors_file)\n",
    "np.save(sec_q_corpus_vectors_file, q_corpus_sentence_embeddings)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "### 获得所有待测数据第一阶段模型预测结果"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from config import *\n",
    "from utils import *\n",
    "from task_sentence_embedding_FinanceFAQ_step1_1 import model as model1\n",
    "\n",
    "path_list = fst_eval_path_list\n",
    "\n",
    "# 读取q_std、q_corpus语料和向量\n",
    "q_std_list, q_std_sentence_embeddings, q_all, q_all_sentence_embeddings_dict = read_q_std_q_corpus(q_std_file, fst_q_std_vectors_file, q_corpus_file, fst_q_corpus_vectors_file)\n",
    "\n",
    "for i, input_path in enumerate(path_list):\n",
    "    print(f'开始评估新语料: {i}'.center(120, '='))\n",
    "    df_eval = pd.read_csv(input_path, sep=\"\\t\")\n",
    "    df_eval = df_eval[~pd.isna(df_eval.q_sim)]\n",
    "    output_path = input_path[:-4] + '_result.tsv'\n",
    "    print('input_path: ', input_path, 'output_path: ', output_path)\n",
    "\n",
    "    print(\"目标语料数量：\", df_eval.shape, '标问数量：', df_eval.q_std.nunique(), '相似问数量：',\n",
    "          df_eval.q_sim.nunique(), '标语料去重后数量', df_eval.drop_duplicates([\"q_std\", \"q_sim\"]).shape[0])\n",
    "\n",
    "    ## v1 对于都是有一个是小量的情况下\n",
    "    df_eval = cal_performance(model1, q_all_sentence_embeddings_dict, q_std_sentence_embeddings, q_std_list, df_eval, K=10)\n",
    "    df_eval.to_csv(output_path, index=None, sep=\"\\t\")\n"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "### 获得所有待测数据第二阶段模型预测结果"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "import os\n",
    "import torch\n",
    "from task_sentence_embedding_FinanceFAQ_step2_1 import model as model2\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from config import *\n",
    "\n",
    "path_list = sec_eval_path_list\n",
    "\n",
    "# 读取q_std、q_corpus语料和向量\n",
    "q_std_list, q_std_sentence_embeddings, q_all, q_all_sentence_embeddings_dict = read_q_std_q_corpus(q_std_file, sec_q_std_vectors_file, q_corpus_file, sec_q_corpus_vectors_file)\n",
    "# 标问和向量的映射\n",
    "corpus_sentence_embeddings_dict = {q_std_list[i]: q_std_sentence_embeddings[i] for i in range(0, len(q_std_list))}\n",
    "\n",
    "for i, input_path in enumerate(path_list):\n",
    "    print(f'开始评估新语料: {i}'.center(120, '='))\n",
    "    df_eval = pd.read_csv(input_path, sep=\"\\t\")\n",
    "    output_path = input_path[:-4] + '_result.tsv'\n",
    "    print('input_path: ', input_path, 'output_path: ', output_path)\n",
    "\n",
    "    texts = df_eval.q_sim.tolist()\n",
    "    texts_in = [v for v in texts if v in q_all_sentence_embeddings_dict.keys()]\n",
    "    texts_out = [v for v in texts if v not in q_all_sentence_embeddings_dict.keys()]\n",
    "    texts_out_embeddings = model2.encode(texts_out) if texts_out else []\n",
    "    texts_embeddings_dict_1 = {texts_in[i]: q_all_sentence_embeddings_dict[texts_in[i]] for i in range(0, len(texts_in))}\n",
    "    texts_embeddings_dict_2 = {texts_out[i]: texts_out_embeddings[i] for i in range(0, len(texts_out))}\n",
    "    texts_embeddings_dict = {**texts_embeddings_dict_1, **texts_embeddings_dict_2}\n",
    "    print('目标语料编码数量：——>', len(texts_embeddings_dict))\n",
    "\n",
    "    def get_sec_result(text, std_texts):\n",
    "        '''预测模型2的结果\n",
    "        '''\n",
    "        a_text_embeddings = texts_embeddings_dict[text]  # 获取改相似问在模型2中的向量\n",
    "        b_text_embeddings = np.array([corpus_sentence_embeddings_dict[v] for v in std_texts])  # 拿到模型1召回的候选标问在模型2中的向量\n",
    "        sims_with_std = cos_sim4matrix_2(a_text_embeddings, b_text_embeddings).reshape(-1)\n",
    "        sort_idx = np.argsort(-sims_with_std).tolist()\n",
    "        intents_sort = [std_texts[idx] for idx in sort_idx]\n",
    "        sims_values = [sims_with_std[idx] for idx in sort_idx]\n",
    "        result = list(zip(intents_sort, sims_values))\n",
    "        return (result)\n",
    "\n",
    "    # 模型1预测结果\n",
    "    df_eval['q_std_pred_list_v1'] = df_eval.q_std_pred_list_v1.apply(lambda v: eval(v))\n",
    "\n",
    "    # 模型2预测结果\n",
    "    df_eval['q_std_pred_list_2'] = df_eval.apply(lambda row: get_sec_result(row['q_sim'], row['q_std_pred_list_v1']), axis=1)\n",
    "\n",
    "    df_eval['q_std_pred_list_2_v1'] = df_eval.q_std_pred_list_2.apply(lambda v: [k[0] for k in v])\n",
    "    df_eval['q_std_pred_list_2_v2'] = df_eval.q_std_pred_list_2.apply(lambda v: [k[1] for k in v])\n",
    "    df_eval['q_std_pred_2'] = df_eval.q_std_pred_list_2_v1.apply(lambda v: v[0])\n",
    "    df_eval['prob_2'] = df_eval.q_std_pred_list_2_v2.apply(lambda v: v[0])\n",
    "\n",
    "    df_eval['r1'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_2_v1'][0:1] else 0, axis=1)\n",
    "    df_eval['r3'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_2_v1'][0:3] else 0, axis=1)\n",
    "    df_eval['r5'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_2_v1'][0:5] else 0, axis=1)\n",
    "    df_eval['r10'] = df_eval.apply(lambda row: 1 if row['q_std'] in row['q_std_pred_list_2_v1'][0:10] else 0, axis=1)\n",
    "\n",
    "    # 扣除不包含的标准问\n",
    "    print('目标语料准确率：——>')\n",
    "    print(df_eval.shape)\n",
    "    df_1 = df_eval\n",
    "    print('第一阶段整体准确率', df_1.t1.sum() / df_1.shape[0], df_1.t3.sum() / df_1.shape[0], df_1.t5.sum() / df_1.shape[0], df_1.t10.sum() / df_1.shape[0])\n",
    "    df_2 = df_eval[df_eval.t10 == 1]\n",
    "    print('第二阶段整体准确率', df_2.r1.sum() / df_2.shape[0], df_2.r3.sum() / df_2.shape[0], df_2.r5.sum() / df_2.shape[0], df_2.r10.sum() / df_2.shape[0])\n",
    "    df_3 = df_eval\n",
    "    print('整体准确率', df_3.r1.sum() / df_3.shape[0], df_3.r3.sum() / df_3.shape[0], df_3.r5.sum() / df_3.shape[0], df_3.r10.sum() / df_3.shape[0])\n",
    "\n",
    "    ##扣除不包含的标准问\n",
    "    print('目标语料准确率[有效标问]：——>')\n",
    "    df_k_need = df_eval[df_eval.ifin == 1]\n",
    "    print(df_k_need.shape)\n",
    "    df_1 = df_k_need\n",
    "    print('第一阶段整体准确率', df_1.t1.sum() / df_1.shape[0], df_1.t3.sum() / df_1.shape[0], df_1.t5.sum() / df_1.shape[0], df_1.t10.sum() / df_1.shape[0])\n",
    "    df_2 = df_k_need[df_k_need.t10 == 1]\n",
    "    print('第二阶段整体准确率', df_2.r1.sum() / df_2.shape[0], df_2.r3.sum() / df_2.shape[0], df_2.r5.sum() / df_2.shape[0], df_2.r10.sum() / df_2.shape[0])\n",
    "    df_3 = df_k_need\n",
    "    print('整体准确率', df_3.r1.sum() / df_3.shape[0], df_3.r3.sum() / df_3.shape[0], df_3.r5.sum() / df_3.shape[0], df_3.r10.sum() / df_3.shape[0])\n",
    "    df_eval.to_csv(output_path, index=None, sep=\"\\t\")\n"
   ],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.8.8 64-bit ('base': conda)"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "e42634819b8c191a5d07eaf23810ff32516dd8d3875f28ec3e488928fbd3c187"
   }
  },
  "interpreter": {
   "hash": "509cf8fb3e64af7327dbc287206db89f13b65f7dad389d82b165e29388b2e60b"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}